##################################################################
##################################################################
## Replication Material
## Stefan Müller: The Temporal Focus of Campaign Communication
## The Journal of Politics
## stefan.mueller@ucd.ie
##
## Script 2: Classify party manifestos
##################################################################
##################################################################

# Note: The file description_replication_material_jop_mueller.pdf describes the purpose of this 
# file in detail and lists the names and sources of all datasets 
# used in this script

# This script was run on the following R version, platform and OS:
# R version 3.6.0 (2019-04-26)
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS Catalina 10.15.5

# load packages required to run this script
library(dplyr)               # CRAN v1.0.0                
library(stringr)             # CRAN v1.4.0 
library(quanteda)            # CRAN v2.0.1             
library(quanteda.textmodels) # CRAN v0.9.1 
library(spacyr)              # CRAN v1.2.1   

# Let quanteda use 10 threads for its parallelised operations
# (10 threads requires 32 GB RAM)
quanteda_options(threads = 10)

# Read the merged dataset that the previous script wrote to disk
dat_merged <- readRDS("data_merged.rds")

# English-speaking countries that enter the analysis
countries_en_select <- c(
    "Australia", "Canada", "Ireland",
    "United Kingdom", "New Zealand", "United States"
)

# English manifestos without annotations from the selected countries
dat_merged_not_annotated_en <- filter(
    dat_merged,
    annotations == "FALSE",
    countryname %in% countries_en_select,
    language == "english"
)

# Number of rows per country in this subset
table(dat_merged_not_annotated_en$countryname)

# Swap " -" for " •" in the raw texts; "•" is one of the characters the
# segmentation pattern below splits on
dat_merged_not_annotated_en <- mutate(
    dat_merged_not_annotated_en,
    text = str_replace_all(text, " -", " •")
)

# Build a corpus and cut every text into segments that end after
# ".", "?", "!" or "•"
corpus_en_not_annotated <- corpus_segment(
    corpus(dat_merged_not_annotated_en, text_field = "text"),
    pattern = "[.?!•]",
    valuetype = "regex",
    extract_pattern = FALSE,
    pattern_position = "after"
)

# Sanity check: run the same segmentation pattern on two toy documents
txt <- c(
    d1 = "This, is a sentence •  You: come here but lets see–adsf. -Let's go",
    d2 = "Yes, yes okay."
)
char_segment(
    txt,
    pattern = "[.?!•]",
    valuetype = "regex",
    remove_pattern = FALSE,
    pattern_position = "after"
)

# Store the number of tokens per segment as a docvar
# (punctuation is not counted as a token)
docvars(corpus_en_not_annotated, "ntoken") <- ntoken(
    corpus_en_not_annotated,
    remove_punct = TRUE
)

# Keep only segments with more than 4 tokens
corpus_en_not_annotated_subset <- corpus_subset(
    corpus_en_not_annotated,
    ntoken > 4
)

# Annotated English manifestos: rows with a non-missing CMP code from the
# selected English-speaking countries
dat_merged_annotated_en <- filter(
    dat_merged,
    annotations == "TRUE",
    !is.na(cmp_code),
    countryname %in% countries_en_select,
    language == "english"
)

# Build the corpus; no segmentation is applied because the annotated texts
# are already on the level of quasi-sentences
corpus_en_annotated <- corpus(
    dat_merged_annotated_en,
    text_field = "text"
)

# Store the number of tokens per quasi-sentence as a docvar
# (punctuation is not counted as a token)
docvars(corpus_en_annotated, "ntoken") <- ntoken(
    corpus_en_annotated,
    remove_punct = TRUE
)

# Keep only quasi-sentences with more than 4 tokens
corpus_en_annotated_subset <- corpus_subset(
    corpus_en_annotated,
    ntoken > 4
)

# German-speaking countries that enter the analysis
countries_ger_select <- c("Germany", "Austria", "Switzerland")

# Annotated German manifestos: rows with a non-missing CMP code from the
# selected German-speaking countries
dat_merged_annotated_ger <- filter(
    dat_merged,
    annotations == "TRUE",
    !is.na(cmp_code),
    countryname %in% countries_ger_select,
    language == "german"
)

# Build the corpus from the annotated texts
corpus_ger_annotated <- corpus(
    dat_merged_annotated_ger,
    text_field = "text"
)

# Store the number of tokens per quasi-sentence as a docvar
# (punctuation is not counted as a token)
docvars(corpus_ger_annotated, "ntoken") <- ntoken(
    corpus_ger_annotated,
    remove_punct = TRUE
)

# Keep only quasi-sentences with more than 4 tokens
corpus_ger_annotated_subset <- corpus_subset(
    corpus_ger_annotated,
    ntoken > 4
)

# German manifestos without annotations from the selected countries
dat_merged_not_annotated_ger <- filter(
    dat_merged,
    annotations == "FALSE",
    countryname %in% countries_ger_select,
    language == "german"
)

# Swap " -" for " •" in the raw texts; "•" is one of the characters the
# segmentation pattern below splits on
dat_merged_not_annotated_ger <- mutate(
    dat_merged_not_annotated_ger,
    text = str_replace_all(text, " -", " •")
)

# Build a corpus and cut every text into segments that end after
# ".", "?", "!" or "•"
corpus_ger_not_annotated <- corpus_segment(
    corpus(dat_merged_not_annotated_ger, text_field = "text"),
    pattern = "[.?!•]",
    valuetype = "regex",
    extract_pattern = FALSE,
    pattern_position = "after"
)

# Store the number of tokens per segment as a docvar
# (punctuation is not counted as a token)
docvars(corpus_ger_not_annotated, "ntoken") <- ntoken(
    corpus_ger_not_annotated,
    remove_punct = TRUE
)

# Keep only segments with more than 4 tokens
corpus_ger_not_annotated_subset <- corpus_subset(
    corpus_ger_not_annotated,
    ntoken > 4
)

# Read the hand-coded sentences that serve as training data
dat_classified_english <- readRDS("data_sentences_classified_english.rds")
dat_classified_german <- readRDS("data_sentences_classified_german.rds")

# Turn each training set into a document-feature matrix
# (corpus() and dfm() are called with their default settings)
dfmat_coded_en <- dfm(corpus(dat_classified_english))

dfmat_coded_ger <- dfm(corpus(dat_classified_german))

# Train one SVM per language on the "class" docvar of the training set.
# textmodel_svm() comes from quanteda.textmodels; the package might ship an
# updated implementation at some stage, so please use v0.9.1 to reproduce
# the classification.
tmod_ger_svm_class <- textmodel_svm(
    x = dfmat_coded_ger,
    y = docvars(dfmat_coded_ger, "class")
)

tmod_en_svm_class <- textmodel_svm(
    x = dfmat_coded_en,
    y = docvars(dfmat_coded_en, "class")
)


# create document-feature matrices for classification

# Lower-cased document-feature matrices without punctuation, one for each of
# the four sentence-level corpora
dfm_en_annotated_subset <- dfm(
    corpus_en_annotated_subset,
    tolower = TRUE,
    remove_punct = TRUE
)

dfm_en_not_annotated_subset <- dfm(
    corpus_en_not_annotated_subset,
    tolower = TRUE,
    remove_punct = TRUE
)

dfm_ger_annotated_subset <- dfm(
    corpus_ger_annotated_subset,
    tolower = TRUE,
    remove_punct = TRUE
)

dfm_ger_not_annotated_subset <- dfm(
    corpus_ger_not_annotated_subset,
    tolower = TRUE,
    remove_punct = TRUE
)

# Apply the language-specific SVM to every sentence and request the
# probability of each class rather than only the predicted class
pred_svm_en_annotated_class <- predict(
    tmod_en_svm_class,
    newdata = dfm_en_annotated_subset,
    type = "probability"
)

pred_svm_en_not_annotated_class <- predict(
    tmod_en_svm_class,
    newdata = dfm_en_not_annotated_subset,
    type = "probability"
)

pred_svm_ger_annotated_class <- predict(
    tmod_ger_svm_class,
    newdata = dfm_ger_annotated_subset,
    type = "probability"
)

pred_svm_ger_not_annotated_class <- predict(
    tmod_ger_svm_class,
    newdata = dfm_ger_not_annotated_subset,
    type = "probability"
)

# Load the dictionaries and keep only the categories used later on
# (for copyright reasons, the LIWC dictionaries cannot be uploaded to
# Dataverse)

# LIWC 2007 (English) -- file not included in the Dataverse!
liwc_english <- dictionary(
    file = "../data_notshare/LIWC2007_English.dic"
)[c("posemo", "negemo", "past", "present", "future", "certain", "tentat")]

# LIWC 2015 (English) -- file not included in the Dataverse!
liwc_english_15 <- dictionary(
    file = "../data_notshare/LIWC2015_English_Flat.dic"
)[c("posemo", "negemo", "focuspast", "focuspresent", "focusfuture")]

# LIWC 2001 (German) -- file not included in the Dataverse!
liwc_german <- dictionary(
    file = "../data_notshare/LIWC2001_German_UTF8.dic"
)[c("Posemo", "Negemo", "Certain", "Tentat", "Past", "Present", "Future")]

# Translated (German) LSD dictionary
data_dictionary_lsdgerman <- readRDS("data_dictionary_lsdgerman.rds")

# Rauh's sentiment dictionary
data_dictionary_rauh <- readRDS("data_dictionary_rauh.rds")

# Apply all English dictionaries at once.
#
# x: a quanteda tokens object. Any other input (e.g. a corpus) is tokenised
#    with tokens() first; tokens input is used unchanged.
#
# Returns a data frame with one row per document that column-binds the
# results of three lookups: data_dictionary_LSD2015 (columns *_lsd),
# liwc_english (columns *_liwc) and liwc_english_15 (columns *_liwc15).
# liwc_english and liwc_english_15 are taken from the calling environment.
apply_dictionaries_en <- function(x) {

    # every lookup below operates on tokens, so coerce other input types
    if (!inherits(x, "tokens")) {
        x <- tokens(x)
    }

    dict_results_lsd <- x %>%
        tokens_remove(pattern = "ireland*") %>% # would be scored as a negative term...
        tokens_lookup(data_dictionary_LSD2015,
                      nested_scope = "dictionary") %>%
        dfm() %>%
        convert(to = "data.frame") %>%
        rename(positive_lsd = positive,
               negative_lsd = negative,
               neg_negative_lsd = neg_negative,
               neg_positive_lsd = neg_positive)

    dict_results_liwc <- tokens_lookup(x, liwc_english,
                                       nested_scope = "dictionary") %>%
        dfm() %>%
        convert(to = "data.frame") %>%
        rename(positive_liwc = posemo,
               negative_liwc = negemo,
               present_liwc = present,
               future_liwc = future,
               past_liwc = past,
               certain_liwc = certain,
               tentative_liwc = tentat)

    dict_results_liwc_15 <- tokens_lookup(x, liwc_english_15,
                                          nested_scope = "dictionary") %>%
        dfm() %>%
        convert(to = "data.frame") %>%
        rename(positive_liwc15 = posemo,
               negative_liwc15 = negemo,
               past_liwc15 = focuspast,
               present_liwc15 = focuspresent,
               future_liwc15 = focusfuture)

    # return the three result sets side by side
    bind_cols(
        dict_results_lsd,
        dict_results_liwc,
        dict_results_liwc_15
    )
}

# Apply all German dictionaries at once.
#
# x: a quanteda tokens object. Any other input (e.g. a corpus) is tokenised
#    with tokens() first; tokens input is used unchanged.
#
# Returns a data frame with one row per document that column-binds the
# results of three lookups: liwc_german, data_dictionary_rauh (columns
# *_rauh) and data_dictionary_lsdgerman (columns *_lsd). Only the positive
# and negative LIWC categories are renamed (to *_liwc); the other LIWC
# columns keep the names produced by dfm(). All three dictionaries are taken
# from the calling environment.
apply_dictionaries_ger <- function(x) {

    # every lookup below operates on tokens, so coerce other input types
    if (!inherits(x, "tokens")) {
        x <- tokens(x)
    }

    dict_results_liwc <- tokens_lookup(x, liwc_german,
                                       nested_scope = "dictionary") %>%
        dfm() %>%
        convert(to = "data.frame") %>%
        rename(positive_liwc = posemo,
               negative_liwc = negemo)

    # German negation words are replaced by "not" before the lookup
    # NOTE(review): presumably the negated entries of the Rauh dictionary
    # are built on "not" -- confirm against the dictionary file
    dict_results_rauh <- x %>%
        tokens_replace(pattern = c("nicht", "nichts", "kein",
                                   "keine", "keinen"),
                       replacement = rep("not", 5)) %>%
        tokens_lookup(data_dictionary_rauh,
                      nested_scope = "dictionary") %>%
        dfm() %>%
        convert(to = "data.frame") %>%
        rename(positive_rauh = positive,
               negative_rauh = negative,
               neg_positive_rauh = neg_positive,
               neg_negative_rauh = neg_negative)

    dict_results_lsd_de <- tokens_lookup(x, data_dictionary_lsdgerman,
                                         nested_scope = "dictionary") %>%
        dfm() %>%
        convert(to = "data.frame") %>%
        rename(positive_lsd = pos,
               negative_lsd = neg)

    # return the three result sets side by side
    bind_cols(
        dict_results_liwc,
        dict_results_rauh,
        dict_results_lsd_de
    )
}

# For each of the four corpora, build one data frame that holds, per
# sentence, the dictionary results, the predicted class probabilities, the
# text and the document-level variables (bound column-wise in that order)

dat_en_not_annotated_full <- bind_cols(
    apply_dictionaries_en(x = tokens(corpus_en_not_annotated_subset)),
    as.data.frame(pred_svm_en_not_annotated_class)
) %>%
    mutate(text = texts(corpus_en_not_annotated_subset)) %>%
    bind_cols(docvars(corpus_en_not_annotated_subset))

dat_en_annotated_full <- bind_cols(
    apply_dictionaries_en(x = tokens(corpus_en_annotated_subset)),
    as.data.frame(pred_svm_en_annotated_class)
) %>%
    mutate(text = texts(corpus_en_annotated_subset)) %>%
    bind_cols(docvars(corpus_en_annotated_subset))

dat_ger_annotated_full <- bind_cols(
    apply_dictionaries_ger(x = tokens(corpus_ger_annotated_subset)),
    as.data.frame(pred_svm_ger_annotated_class)
) %>%
    mutate(text = texts(corpus_ger_annotated_subset)) %>%
    bind_cols(docvars(corpus_ger_annotated_subset))

dat_ger_not_annotated_full <- bind_cols(
    apply_dictionaries_ger(x = tokens(corpus_ger_not_annotated_subset)),
    as.data.frame(pred_svm_ger_not_annotated_class)
) %>%
    mutate(text = texts(corpus_ger_not_annotated_subset)) %>%
    bind_cols(docvars(corpus_ger_not_annotated_subset))


# Stack the four data frames and move the text to the first column
dat_combined <- dplyr::select(
    bind_rows(
        dat_ger_annotated_full,
        dat_ger_not_annotated_full,
        dat_en_annotated_full,
        dat_en_not_annotated_full
    ),
    text, everything()
)


# Prepare the part-of-speech tagging: the tagger output is used further
# below to keep only sentences that contain a verb

# create a unique ID for each document (i.e. sentence): the manifesto ID
# plus the running number of the sentence within that manifesto.
# seq_len(n()) yields the same numbering as 1:n() but also handles an
# empty group, for which 1:n() would give c(1, 0)
dat_combined <- dat_combined %>%
    group_by(manifesto_id) %>%
    mutate(doc_id = paste(manifesto_id, seq_len(n()), sep = "_")) %>%
    ungroup()

# check that each doc_id is unique!
stopifnot(nrow(dat_combined) == length(unique(dat_combined$doc_id)))

# filter English documents first
dat_combined_en <- filter(dat_combined, language == "english")


# Start spaCy with the English language model
# (installation instructions: http://spacyr.quanteda.io)

spacy_initialize(model = "en")

# Tag parts of speech only; lemmas and named entities are switched off
dat_combined_en_postagged <- dat_combined_en %>%
    spacy_parse(pos = TRUE, lemma = FALSE, entity = FALSE)


# IDs of the English documents with at least one token tagged as a verb
dat_combined_en_postagged_verbs <- unique(
    dat_combined_en_postagged %>%
        filter(pos == "VERB") %>%
        select(doc_id)
)

# Close the English spaCy session, then load the German model and repeat
# the same steps with the German data
spacy_finalize()

## requires downloading the German language model
# spacy_download_langmodel(model = "de")
spacy_initialize(model = "de")

dat_combined_de <- dat_combined %>%
    filter(language == "german")

dat_combined_de_postagged <- dat_combined_de %>%
    spacy_parse(pos = TRUE, lemma = FALSE, entity = FALSE)

# IDs of the German documents with at least one token tagged as a verb
dat_combined_de_postagged_verbs <- unique(
    dat_combined_de_postagged %>%
        filter(pos == "VERB") %>%
        select(doc_id)
)

# Stack the German and English document IDs
dat_combined_postagged_verbs <- bind_rows(
    dat_combined_de_postagged_verbs,
    dat_combined_en_postagged_verbs
)

# Compare the number of documents containing a verb with the total
nrow(dat_combined_postagged_verbs)
nrow(dat_combined)


# Keep only the sentences whose doc_id is in the list of documents that
# contain a verb

dat_combined_verbs <- filter(
    dat_combined,
    doc_id %in% dat_combined_postagged_verbs$doc_id
)

nrow(dat_combined_verbs)

# Share of sentences (as a proportion of the full data) that was dropped
# by keeping only sentences containing a verb
(nrow(dat_combined) - nrow(dat_combined_verbs)) / nrow(dat_combined)


# remove table of contents --------

# Pattern for leader characters: any character followed by four or more
# dots, or a run of underscores, or a run of hyphens
string_tocs <- ".\\.\\.\\.\\.\\.*|_________*|--------*"

# Flag every sentence that matches the pattern
dat_combined_verbs <- mutate(
    dat_combined_verbs,
    check_sentence_toc = stringr::str_detect(text, string_tocs)
)

# Number of flagged and unflagged sentences
table(dat_combined_verbs$check_sentence_toc)

## drop the flagged sentences (treated as part of the TOC)
dat_combined_verbs <- filter(dat_combined_verbs, !check_sentence_toc)

# Add the decade (first three digits of the year plus "0s") and a
# title-cased version of the language variable
dat_combined_verbs <- mutate(
    dat_combined_verbs,
    decade = paste0(substr(year, 1, 3), "0s"),
    language_capital = stringi::stri_trans_totitle(language)
)

# Put the three class probabilities into a data frame of their own
dat_classes <- dat_combined_verbs %>%
    select(Past, Present, Future)

# Highest of the three probabilities in each row
dat_combined_verbs[["class_probability"]] <- apply(dat_classes, 1, max)

# Name of the class with the highest probability
# (on ties, the first column wins)
dat_combined_verbs[["class"]] <- colnames(dat_classes)[
    max.col(dat_classes, ties.method = "first")
]

# Number of sentences per combination of country, party, date and class
dat_combined_verbs <- dat_combined_verbs %>%
    group_by(countryname, party, date, class) %>%
    mutate(number_sentences_class = n()) %>%
    ungroup()

# Recode the CMP party family codes (parfam) into labels, then flag
# extremist parties: the "Nationalist" and "Socialist" families count as
# extremist (same coding scheme as Crabtree et al.)
dat_combined_verbs <- dat_combined_verbs %>%
    mutate(
        party_family = car::recode(parfam, "'10'='Ecological';
                                    '20'='Socialist';
                                    '30'='Social democratic';
                                    '40'='Liberal';
                                    '50'='Christian democratic';
                                    '60'='Conservative';
                                    '70'='Nationalist';
                                    '80'='Agrarian';
                                    '90'='Ethnic and regional';
                                    '95'='Special issue';
                                    '98'='Electoral alliances'"),
        extremist_party = ifelse(
            party_family %in% c("Nationalist", "Socialist"),
            "Extremist party",
            "Not extremist party"
        )
    )

# Drop the columns that are not needed in the final dataset
dat_combined_verbs_select <- select(
    dat_combined_verbs,
    -c(
        starts_with("doc_id..."),
        certain,
        starts_with("tentat"),
        starts_with("certain"),
        intpeace, progtype, testresult,
        coderyear, coderid, eumember,
        oecdmember, is_copy_of, handbook,
        has_eu_code,
        document1, document2,
        id_perm, LOCATION, cpds1
    )
)

# Write the final data frame to disk
saveRDS(dat_combined_verbs_select, file = "data_manifestos_classified.rds")

